Use this guide to load HTML data data
library(rvest)
wiki_url <- read_html("http://wiki.socr.umich.edu/index.php/SOCR_Data_2011_US_JobsRanking")
html_nodes(wiki_url, "#content")
## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body" role="main">\n\t\t\t<a id="top"></a>\n\ ...
js <- html_table(html_nodes(wiki_url, "table")[[1]])
js<-js[, -1]Focus on the Description feature. Replace all underscore characters “_” with spaces
library(tm)
## Loading required package: NLP
js_corpus<-Corpus(VectorSource(js$Description))
corpus_clean<-tm_map(js_corpus, content_transformer(function(x) gsub(x, pattern = "_", replacement = " ")))
corpus_clean <- tm_map(corpus_clean, removeWords, stopwords("english"))
corpus_clean<-tm_map(corpus_clean, tolower)
inspect(corpus_clean[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] researches designs develops maintains software systems along hardware development medical scientific industrial purposes
## [2] applies mathematical theories formulas teach solve problems business educational industrial climate
## [3] interprets statistics determine probabilities accidents sickness death loss property theft natural disastersSave the data using write.csv() and then use the
read.transactions() in arules package to read
the CSV data file. Visualize the item support using item frequency
plots.
js_corpus1<-Corpus(VectorSource(js$Job_Title))
corpus_clean1<-tm_map(js_corpus1, content_transformer(function(x) gsub(x, pattern = "_", replacement = " ")))
inspect(corpus_clean1[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] Software Engineer Mathematician Actuary
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
js2<-cbind(js,corpus_clean$content,corpus_clean1$content)
js2$`corpus_clean$content`->js2$desc
js2$`corpus_clean1$content`->js2$jt
js2<-js2[,12:13]
js2$desc<-strsplit(js2$desc," ")
js2$desc<-plyr::ldply(js2$desc, rbind)
write.csv(js2, "jobsat.csv", row.names=F)
library(arules)
## Warning: package 'arules' was built under R version 4.2.3
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:tm':
##
## inspect
## The following objects are masked from 'package:base':
##
## abbreviate, write
js3<-read.transactions("jobsat.csv", sep = ",", skip = 1,rm.duplicates=TRUE)
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## Warning in scan(text = l, what = "character", sep = sep, quote = quote, : EOF
## within quoted string
## distribution of transactions with duplicates:
## items
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 19 20
## 3 1 2 3 6 11 7 11 13 16 17 19 24 20 23 9 4 1 2
summary(js3)
## transactions as itemMatrix in sparse format with
## 200 rows (elements/itemsets/transactions) and
## 1241 columns (items) and a density of 0.007973409
##
## most frequent items:
## repairs prepares assists operates studies (Other)
## 19 15 14 13 13 1905
##
## element (itemset/transaction) length distribution:
## sizes
## 2 5 6 7 8 9 10 11 12 13 14 15 16 17 19
## 3 3 6 13 41 29 29 28 18 13 7 4 3 2 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 8.000 10.000 9.895 11.000 19.000
##
## includes extended item information - examples:
## labels
## 1 --date
## 2 (heart
## 3 16-wheeled
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
mat <- as.matrix(js3@data)
rowNames <- js3@itemInfo$labels
colNames <- paste0("S", c(1:dim(mat)[2]))
rownames(mat) <- rowNames
colnames(mat) <- colNames
df <- as.data.frame(1*mat)
df$avg <- rowMeans(df)
dfOrdered <- df[order(df$avg, decreasing = T), ]
matOrdered <- as.matrix(dfOrdered)
# track the ordered row names
rowNames <- rownames(dfOrdered)
colNames <- colnames(dfOrdered)
plot_ly(x = reorder(rowNames[c(1:20)], -dfOrdered[1:20, "avg"]), y=dfOrdered[1:20, "avg"], name="Top 20 Terms", type="bar") %>%
layout(title='Frequency of Job Satisfaction (Top 20) based on averaging across Cases',
xaxis = list(title="Term"),
yaxis = list(title="Relative Frequency"))
Generate the sparse terms matrix for each job category. What terms appear as more popular?
plot_ly(x=rowNames[c(1:20)], y=js2$jt[1:50], z=matOrdered[1:50, 1:20], type="heatmap") %>%
layout(title='Heatmap - Top-20 Terms for the first 50 Job Title') %>% hide_colorbar()
## "repairs" is the most common terms in the first 50 Job Title selected.Fit a model:
myrules<-apriori(data=jobs,parameter=list(support=0.1,confidence=0.8,minlen=1)).
Try out several rule thresholds trading off gain and accuracy
apriori(js3, parameter=list(support=0.1, confidence=0.8, minlen=1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 20
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1241 item(s), 200 transaction(s)] done [0.00s].
## sorting and recoding items ... [0 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 0 rules
## Default have no rule
apriori(js3, parameter=list(support=0.01, confidence=0.8, minlen=1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1241 item(s), 200 transaction(s)] done [0.00s].
## sorting and recoding items ... [299 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [197 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 197 rules
## Lower support value resulted in 189 rules
apriori(js3, parameter=list(support=0.1, confidence=0.025, minlen=1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.025 0.1 1 none FALSE TRUE 5 0.1 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 20
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1241 item(s), 200 transaction(s)] done [0.00s].
## sorting and recoding items ... [0 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 0 rules
## Lower confidence value resulted in no rule
apriori(js3, parameter=list(support=0.1, confidence=0.8, minlen=3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 3
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 20
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1241 item(s), 200 transaction(s)] done [0.00s].
## sorting and recoding items ... [0 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 done [0.00s].
## writing ... [0 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 0 rules
## Increased minlen value resulted in no rule
apriori(js3, parameter=list(support=0.01, confidence=0.25, minlen=1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1241 item(s), 200 transaction(s)] done [0.00s].
## sorting and recoding items ... [299 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [490 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## set of 490 rules
## Lower support and confidence resulted in 462 rules
jsr<-apriori(js3, parameter=list(support=0.01, confidence=0.25, minlen=3))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.25 0.1 1 none FALSE TRUE 5 0.01 3
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[1241 item(s), 200 transaction(s)] done [0.00s].
## sorting and recoding items ... [299 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [163 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## Increased minlen value on the current support and confidence value resulted in lower rules of 163
jsr
## set of 163 rules
## the rules have to at least 1% of the job title in the study and have at least 25% accuracy. Moreover, minlen=3 removes all rules that have fewer than three items.Evaluate model performance with lift
summary(jsr)
## set of 163 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4
## 147 16
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 3.000 3.000 3.098 3.000 4.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01000 Min. :0.4000 Min. :0.01000 Min. : 5.263
## 1st Qu.:0.01000 1st Qu.:0.7083 1st Qu.:0.01000 1st Qu.: 18.182
## Median :0.01000 Median :1.0000 Median :0.01000 Median : 33.333
## Mean :0.01028 Mean :0.8927 Mean :0.01224 Mean : 44.273
## 3rd Qu.:0.01000 3rd Qu.:1.0000 3rd Qu.:0.01500 3rd Qu.: 66.667
## Max. :0.01500 Max. :1.0000 Max. :0.02500 Max. :100.000
## count
## Min. :2.000
## 1st Qu.:2.000
## Median :2.000
## Mean :2.055
## 3rd Qu.:2.000
## Max. :3.000
##
## mining info:
## data ntransactions support confidence
## js3 200 0.01 0.25
## call
## apriori(data = js3, parameter = list(support = 0.01, confidence = 0.25, minlen = 3))
#We have 147 rules that contain 3 items; 16 rules containing 4 items.
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.2.3
sortedRule <- sort(jsr)
x1 <- sortedRule@quality$support
y1 <- sortedRule@quality$confidence
z1 <- sortedRule@quality$lift
col1 <- sortedRule@quality$count
ruleNames <- paste0("Rule", c(1:length(sortedRule@quality$support)))
plot_ly(x = ~x1, y = ~y1, z = ~z1, color = ~z1, name=ruleNames) %>%
add_markers() %>%
layout(title=paste0("Arule Support-Confidence-Lift Plot (for all ", length(sortedRule@quality$support), " rules)"),
scene = list(xaxis = list(title = 'Support'),
yaxis = list(title = 'Confidence'),
zaxis = list(title = 'Lift'))) %>%
hide_colorbar()
inspect(head(jsr))
## lhs rhs support confidence coverage lift
## [1] {fire, protects} => {property} 0.01 1 0.01 40
## [2] {property, protects} => {fire} 0.01 1 0.01 100
## [3] {fire, property} => {protects} 0.01 1 0.01 100
## [4] {events, newspapers} => {newsworthy} 0.01 1 0.01 100
## [5] {events, newsworthy} => {newspapers} 0.01 1 0.01 100
## [6] {newspapers, newsworthy} => {events} 0.01 1 0.01 100
## count
## [1] 2
## [2] 2
## [3] 2
## [4] 2
## [5] 2
## [6] 2
# Using the first observation as example: if job term involve fire or protects, it is likely that the job term involve property as there are assosication between fire, protects and property.Try to improve the model performance
jsrp<- jsr[!is.redundant(jsr, measure="lift")]
jsrp
## set of 147 rules
# Remove redundant rules resulted in 147 rules.Sort the set of association rules
inspect(head(sort(jsrp, by="lift")))
## lhs rhs support confidence coverage lift
## [1] {property, protects} => {fire} 0.01 1 0.01 100
## [2] {fire, property} => {protects} 0.01 1 0.01 100
## [3] {events, newspapers} => {newsworthy} 0.01 1 0.01 100
## [4] {events, newsworthy} => {newspapers} 0.01 1 0.01 100
## [5] {newspapers, newsworthy} => {events} 0.01 1 0.01 100
## [6] {events, magazines} => {newspapers} 0.01 1 0.01 100
## count
## [1] 2
## [2] 2
## [3] 2
## [4] 2
## [5] 2
## [6] 2
## Sorted bt lift:
## with unique of job description, the lift values in generally are high, often expect the terms to be together.
## if job term involve property or protects, it is likely that the job term involve fire as there are assosication between fire, protects and property. Although Some of these terms are interchangeable between LHS and RHS, Slightly difference with model before sorting the lift as fire + protects related with propertywith lower lift.
## if job term involve events or newspaper, it is likely that the job term involve newsworthy as there are assosication between events, newspaper and newsworthy.Some of these terms are interchangeable between LHS and RHS with same lift.Investigate associations that may be linked to a specific job-description terms.
## Use "repairs" as example:
rrules<-subset(jsrp, items %in% "repairs")
inspect(rrules)
## lhs rhs support confidence coverage
## [1] {residential, systems} => {repairs} 0.010 1.0000000 0.010
## [2] {repairs, residential} => {systems} 0.010 1.0000000 0.010
## [3] {repairs, systems} => {residential} 0.010 0.5000000 0.020
## [4] {commercial, residential} => {repairs} 0.010 0.5000000 0.020
## [5] {repairs, residential} => {commercial} 0.010 1.0000000 0.010
## [6] {commercial, repairs} => {residential} 0.010 0.6666667 0.015
## [7] {maintenance, performs} => {repairs} 0.015 0.6000000 0.025
## [8] {maintenance, repairs} => {performs} 0.015 1.0000000 0.015
## [9] {performs, repairs} => {maintenance} 0.015 0.7500000 0.020
## [10] {installs, systems} => {repairs} 0.010 0.6666667 0.015
## [11] {installs, repairs} => {systems} 0.010 0.4000000 0.025
## [12] {repairs, systems} => {installs} 0.010 0.5000000 0.020
## [13] {commercial, systems} => {repairs} 0.010 1.0000000 0.010
## [14] {repairs, systems} => {commercial} 0.010 0.5000000 0.020
## [15] {commercial, repairs} => {systems} 0.010 0.6666667 0.015
## [16] {industrial, systems} => {repairs} 0.010 0.6666667 0.015
## [17] {repairs, systems} => {industrial} 0.010 0.5000000 0.020
## [18] {industrial, repairs} => {systems} 0.010 0.6666667 0.015
## lift count
## [1] 10.526316 2
## [2] 25.000000 2
## [3] 20.000000 2
## [4] 5.263158 2
## [5] 22.222222 2
## [6] 26.666667 2
## [7] 6.315789 3
## [8] 20.000000 3
## [9] 21.428571 3
## [10] 7.017544 2
## [11] 10.000000 2
## [12] 11.111111 2
## [13] 10.526316 2
## [14] 11.111111 2
## [15] 16.666667 2
## [16] 7.017544 2
## [17] 9.090909 2
## [18] 16.666667 2
plot(rrules, method="graph", measure = "support", engine="htmlwidget",
shading = "lift", control = list(verbose = TRUE,main = list(title="Grouped Matrix for the 4 repairs-associated Rules")))
## Warning: Unknown control parameters: main
## Available control parameters (with default values):
## itemCol = #CBD2FC
## nodeCol = c("#EE0000", "#EE0303", "#EE0606", "#EE0909", "#EE0C0C", "#EE0F0F", "#EE1212", "#EE1515", "#EE1818", "#EE1B1B", "#EE1E1E", "#EE2222", "#EE2525", "#EE2828", "#EE2B2B", "#EE2E2E", "#EE3131", "#EE3434", "#EE3737", "#EE3A3A", "#EE3D3D", "#EE4040", "#EE4444", "#EE4747", "#EE4A4A", "#EE4D4D", "#EE5050", "#EE5353", "#EE5656", "#EE5959", "#EE5C5C", "#EE5F5F", "#EE6262", "#EE6666", "#EE6969", "#EE6C6C", "#EE6F6F", "#EE7272", "#EE7575", "#EE7878", "#EE7B7B", "#EE7E7E", "#EE8181", "#EE8484", "#EE8888", "#EE8B8B", "#EE8E8E", "#EE9191", "#EE9494", "#EE9797", "#EE9999", "#EE9B9B", "#EE9D9D", "#EE9F9F", "#EEA0A0", "#EEA2A2", "#EEA4A4", "#EEA5A5", "#EEA7A7", "#EEA9A9", "#EEABAB", "#EEACAC", "#EEAEAE", "#EEB0B0", "#EEB1B1", "#EEB3B3", "#EEB5B5", "#EEB7B7", "#EEB8B8", "#EEBABA", "#EEBCBC", "#EEBDBD", "#EEBFBF", "#EEC1C1", "#EEC3C3", "#EEC4C4", "#EEC6C6", "#EEC8C8", "#EEC9C9", "#EECBCB", "#EECDCD", "#EECFCF", "#EED0D0", "#EED2D2", "#EED4D4", "#EED5D5", "#EED7D7", "#EED9D9", "#EEDBDB", "#EEDCDC", "#EEDEDE", "#EEE0E0", "#EEE1E1", "#EEE3E3", "#EEE5E5", "#EEE7E7", "#EEE8E8", "#EEEAEA", "#EEECEC", "#EEEEEE")
## precision = 3
## igraphLayout = layout_nicely
## interactive = TRUE
## engine = visNetwork
## max = 100
## selection_menu = TRUE
## degree_highlight = 1
## verbose = FALSE
## Used control parameters:
## itemCol = #CBD2FC
## nodeCol = c("#EE0000", "#EE0303", "#EE0606", "#EE0909", "#EE0C0C", "#EE0F0F", "#EE1212", "#EE1515", "#EE1818", "#EE1B1B", "#EE1E1E", "#EE2222", "#EE2525", "#EE2828", "#EE2B2B", "#EE2E2E", "#EE3131", "#EE3434", "#EE3737", "#EE3A3A", "#EE3D3D", "#EE4040", "#EE4444", "#EE4747", "#EE4A4A", "#EE4D4D", "#EE5050", "#EE5353", "#EE5656", "#EE5959", "#EE5C5C", "#EE5F5F", "#EE6262", "#EE6666", "#EE6969", "#EE6C6C", "#EE6F6F", "#EE7272", "#EE7575", "#EE7878", "#EE7B7B", "#EE7E7E", "#EE8181", "#EE8484", "#EE8888", "#EE8B8B", "#EE8E8E", "#EE9191", "#EE9494", "#EE9797", "#EE9999", "#EE9B9B", "#EE9D9D", "#EE9F9F", "#EEA0A0", "#EEA2A2", "#EEA4A4", "#EEA5A5", "#EEA7A7", "#EEA9A9", "#EEABAB", "#EEACAC", "#EEAEAE", "#EEB0B0", "#EEB1B1", "#EEB3B3", "#EEB5B5", "#EEB7B7", "#EEB8B8", "#EEBABA", "#EEBCBC", "#EEBDBD", "#EEBFBF", "#EEC1C1", "#EEC3C3", "#EEC4C4", "#EEC6C6", "#EEC8C8", "#EEC9C9", "#EECBCB", "#EECDCD", "#EECFCF", "#EED0D0", "#EED2D2", "#EED4D4", "#EED5D5", "#EED7D7", "#EED9D9", "#EEDBDB", "#EEDCDC", "#EEDEDE", "#EEE0E0", "#EEE1E1", "#EEE3E3", "#EEE5E5", "#EEE7E7", "#EEE8E8", "#EEEAEA", "#EEECEC", "#EEEEEE")
## precision = 3
## igraphLayout = layout_nicely
## interactive = TRUE
## engine = htmlwidget
## max = 100
## selection_menu = TRUE
## degree_highlight = 1
## verbose = TRUE
plot(sort(rrules, by="lift"), method="grouped", k = 2, control=list(type="items"), engine = "htmlwidget")
## Warning: Unknown control parameters: type
## Available control parameters (with default values):
## k = 20
## aggr.fun = function (x, ...) UseMethod("mean")
## rhs_max = 10
## lhs_label_items = 2
## col = c("#EE0000FF", "#EEEEEEFF")
## groups = NULL
## engine = ggplot2
## verbose = FALSE
plot(rrules, method = "matrix", engine = "htmlwidget")